import os
import shutil
import subprocess
import sys
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from IPython.display import display_html
from IPython.core.display import HTML
from matplotlib.patches import PathPatch
from matplotlib.colors import Colormap
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.manifold import TSNE
from scipy.stats import probplot
# Indicates whether the notebook is running on Kaggle or not.
ON_KAGGLE = os.getenv("KAGGLE_KERNEL_RUN_TYPE") is not None

# Directory for persisted models; created up-front if it does not exist.
MODELS_PATH = Path("models")
MODELS_PATH.mkdir(exist_ok=True)

# Shared palette used by every figure in the notebook.
FONT_COLOR = "#0b1320"
BACKGROUND_COLOR = "#F6F5F5"
DF_CMAP: Colormap = sns.light_palette("#204254", as_cmap=True)  # type: ignore

# Matplotlib rc overrides, grouped by what they style and merged below.
_axes_rc = {
    "axes.labelcolor": FONT_COLOR,
    "axes.labelsize": 10,
    "axes.labelpad": 15,
    "axes.labelweight": "bold",
    "axes.titlesize": 14,
    "axes.titleweight": "bold",
    "axes.titlepad": 15,
    "axes.facecolor": BACKGROUND_COLOR,
}
_tick_rc = {
    "xtick.labelsize": 10,
    "xtick.color": FONT_COLOR,
    "ytick.labelsize": 10,
    "ytick.color": FONT_COLOR,
}
_figure_rc = {
    "figure.titlesize": 14,
    "figure.titleweight": "bold",
    "figure.facecolor": BACKGROUND_COLOR,
    "figure.edgecolor": BACKGROUND_COLOR,
    "figure.dpi": 72,  # Locally Seaborn uses 72, meanwhile Kaggle 96.
}
_font_rc = {
    "font.size": 10,
    "font.family": "Serif",
    "text.color": FONT_COLOR,
}
MY_RC = {**_axes_rc, **_tick_rc, **_figure_rc, **_font_rc}
sns.set_theme(rc=MY_RC)
def download_from_kaggle(competition):
    """Download and extract a Kaggle competition dataset into ``data/``.

    Skips all work when ``data/<competition>.zip`` is already present.
    Requires an installed and authenticated Kaggle CLI.

    Parameters
    ----------
    competition : str
        Kaggle competition slug, e.g. ``"titanic"``.

    Raises
    ------
    subprocess.CalledProcessError
        If the Kaggle CLI download fails.
    """
    data_dir = Path("data")
    archive = Path(f"{competition}.zip")
    if not (data_dir / archive.name).is_file():
        # check=True: fail loudly instead of trying to unpack an archive
        # that was never downloaded.
        subprocess.run(
            ["kaggle", "competitions", "download", "-c", competition],
            check=True,
        )
        data_dir.mkdir(parents=True, exist_ok=True)
        shutil.unpack_archive(archive, data_dir)
        # Keep the zip alongside the extracted files so reruns are no-ops.
        shutil.move(str(archive), str(data_dir))
def adjust_box_widths(g, factor, orient="v"):
    """
    Adjust the widths/heights of a seaborn-generated boxplot.

    Shrinks (or widens) every box around its own midpoint by ``factor``
    and moves any 2-point line whose data exactly spans the old box
    extent (presumably the median line) to match the new extent.

    Parameters
    ----------
    g : matplotlib.figure.Figure
        Figure whose ``g.axes`` contain the boxplot artists; patches are
        mutated in place.
    factor : float
        Multiplier for each box's half-extent (e.g. 0.8 narrows boxes).
    orient : {"v", "h"}
        Selects which vertex coordinate is rescaled: "v" -> index 1,
        "h" -> index 0.

    Raises
    ------
    ValueError
        If ``orient`` is not "v" or "h".
    """
    if orient not in ("v", "h"):
        raise ValueError("The `orient` should be 'v' or 'h'.")
    # Coordinate index to rescale (see `orient` above).
    i = 1
    if orient == "h":
        i = 0
    # iterating through Axes instances
    for ax in g.axes:
        # iterating through axes artists:
        for c in ax.get_children():
            # searching for PathPatches (the box bodies)
            if isinstance(c, PathPatch):
                # getting current height/width of box:
                p = c.get_path()
                verts = p.vertices
                # Skip the final vertex, which closes the path.
                verts_sub = verts[:-1]
                min_ = np.min(verts_sub[:, i])
                max_ = np.max(verts_sub[:, i])
                mid_ = 0.5 * (min_ + max_)
                half_ = 0.5 * (max_ - min_)
                # setting new height of box (shrunk around the midpoint)
                min_new_ = mid_ - factor * half_
                max_new_ = mid_ + factor * half_
                # Mutate the path's vertices in place so the patch
                # redraws with the new extent.
                verts_sub[verts_sub[:, i] == min_, i] = min_new_
                verts_sub[verts_sub[:, i] == max_, i] = max_new_
                # setting new height/width of median line: a 2-point line
                # whose data equals the old [min_, max_] is treated as
                # belonging to this box.
                if orient == "v":
                    for l in ax.lines:
                        if len(l.get_ydata()) == 2 and np.all(
                            l.get_ydata() == [min_, max_]
                        ):
                            l.set_ydata([min_new_, max_new_])
                elif orient == "h":
                    for l in ax.lines:
                        if len(l.get_xdata()) == 2 and np.all(
                            l.get_xdata() == [min_, max_]
                        ):
                            l.set_xdata([min_new_, max_new_])
# Inline CSS tweak for the notebook's rendering of <code> spans.
_CODE_CSS = """
<style>
code {
background: rgba(42, 53, 125, 0.1) !important;
border-radius: 4px !important;
}
</style>
"""
HTML(_CODE_CSS)
# Load the competition data, indexing rows by the unique passenger id.
train, test = (
    pd.read_csv(f"{name}.csv", index_col="PassengerId")
    for name in ("train", "test")
)
train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 891 entries, 1 to 891 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 891 non-null int64 1 Pclass 891 non-null int64 2 Name 891 non-null object 3 Sex 891 non-null object 4 Age 714 non-null float64 5 SibSp 891 non-null int64 6 Parch 891 non-null int64 7 Ticket 891 non-null object 8 Fare 891 non-null float64 9 Cabin 204 non-null object 10 Embarked 889 non-null object dtypes: float64(2), int64(4), object(5) memory usage: 83.5+ KB
# Partition the training columns by dtype and print both groups.
categorical = train.select_dtypes("object").columns
numerical = train.select_dtypes("number").columns
for label, cols in (("Categorical:", categorical), ("Numerical:", numerical)):
    print(label.ljust(15), f"{list(cols)}".ljust(60), len(cols))
Categorical: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'] 5 Numerical: ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'] 6
# Report only the columns that actually contain missing values.
print("Train Dataset NaNs:")
nan_counts = train.isna().sum()
print(nan_counts[nan_counts > 0])
print()
print("Test Dataset NaNs:")
nan_counts = test.isna().sum()
print(nan_counts[nan_counts > 0])
Train Dataset NaNs: Age 177 Cabin 687 Embarked 2 dtype: int64 Test Dataset NaNs: Age 86 Fare 1 Cabin 327 dtype: int64
# Extended numeric summary with tail percentiles, shaded column-wise.
percentiles = [0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99]
summary = train.describe(percentiles=percentiles).drop("count")
summary = summary.rename(index=str.title)
num_description = summary.style.background_gradient(DF_CMAP)
num_description
| Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|
| Mean | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| Std | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| Min | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 1% | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 5% | 0.000000 | 1.000000 | 4.000000 | 0.000000 | 0.000000 | 7.225000 |
| 25% | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| 95% | 1.000000 | 3.000000 | 56.000000 | 3.000000 | 2.000000 | 112.079150 |
| 99% | 1.000000 | 3.000000 | 65.870000 | 5.000000 | 4.000000 | 249.006220 |
| Max | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
# Hierarchically clustered correlation heat map of the numeric columns.
# numeric_only=True: since pandas 2.0, DataFrame.corr() raises on
# object-dtype columns (Name, Sex, ...) unless they are excluded;
# earlier pandas silently dropped them, so the numbers are unchanged.
sns.clustermap(
    train.corr(numeric_only=True),
    linecolor=BACKGROUND_COLOR,
    linewidth=10,
    annot=True,
    cmap=DF_CMAP,
    tree_kws={"linewidths": 1.5, "color": "#141B4D"},
    annot_kws={"fontsize": 12},
    figsize=(11, 11),
)
plt.show()
# Pairwise relationships among predictors: scatter above the diagonal,
# marginal histograms on it, KDE contours below.
grid = sns.PairGrid(train.drop("Survived", axis=1), diag_sharey=False)
grid.fig.set_facecolor(BACKGROUND_COLOR)
grid.fig.set_size_inches(11.7, 11.7)
grid.map_diag(sns.histplot, color="#204254", bins=20)
grid.map_upper(sns.scatterplot, color="#394d5f", marker="+", s=10)
grid.map_lower(sns.kdeplot, levels=10, color="#e8ba91")
plt.show()
# Per-feature distribution grid: overall histogram, class-conditional
# KDEs, and cumulative (CDF-style) histograms split by survival.
survived_mask = train["Survived"] == 1
features = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
fig, axes = plt.subplots(nrows=5, ncols=3, figsize=(11.7, 15), tight_layout=True)
kde_kw = dict(fill=True, linewidth=2, alpha=0.2)
hist_kw = dict(
    linewidth=2, alpha=0.75, element="step", fill=False, cumulative=True, stat="density"
)
for feature, (ax_all, ax_kde, ax_cum) in zip(features, axes):
    survived = train.loc[survived_mask, feature]
    non_survived = train.loc[~survived_mask, feature]
    # Column 1: distribution over all passengers.
    sns.histplot(
        x=train[feature], color="#204254", stat="density", label="All",
        alpha=0.75, ax=ax_all,
    )
    ax_all.legend(loc="upper right")
    # Column 2: density per survival outcome.
    sns.kdeplot(x=survived, color="#394d5f", label="1", ax=ax_kde, **kde_kw)
    sns.kdeplot(x=non_survived, color="#e8ba91", label="0", ax=ax_kde, **kde_kw)
    ax_kde.set_ylabel("")
    ax_kde.legend(loc="upper right", title="Survived")
    # Column 3: empirical cumulative distributions per outcome.
    sns.histplot(x=survived, color="#394d5f", label="1", ax=ax_cum, **hist_kw)
    sns.histplot(x=non_survived, color="#e8ba91", label="0", ax=ax_cum, **hist_kw)
    ax_cum.legend(loc="lower right", title="Survived")
    ax_cum.set_ylabel("")
plt.show()
# Human-readable column captions shared by the pivot-table summaries below.
names = dict(
    mean="Mean Survival Rate",
    sum="Survivors",
    count="Group Size",
)
def _survival_pivot(index):
    # Styled survival pivot (rate / survivors / group size) over `index`,
    # with a "Total" margin row, rendered inline for side-by-side display.
    return (
        train.pivot_table(
            values="Survived",
            index=index,
            aggfunc=["mean", "sum", "count"],
            margins=True,
            margins_name="Total",
        )
        .rename(columns=names)
        .droplevel(level=1, axis="columns")
        .style.background_gradient(DF_CMAP)
        .set_table_attributes("style='display:inline'")
    )


# Survival by class and age quintile, and by class alone, side by side.
df1 = _survival_pivot(("Pclass", pd.qcut(train["Age"], 5)))
df2 = _survival_pivot("Pclass")
display_html(df1._repr_html_() + df2._repr_html_(), raw=True)
| Mean Survival Rate | Survivors | Group Size | ||
|---|---|---|---|---|
| Pclass | Age | |||
| 1 | (0.419, 19.0] | 0.809524 | 17 | 21 |
| (19.0, 25.0] | 0.761905 | 16 | 21 | |
| (25.0, 31.8] | 0.666667 | 16 | 24 | |
| (31.8, 41.0] | 0.777778 | 35 | 45 | |
| (41.0, 80.0] | 0.506667 | 38 | 75 | |
| 2 | (0.419, 19.0] | 0.742857 | 26 | 35 |
| (19.0, 25.0] | 0.400000 | 12 | 30 | |
| (25.0, 31.8] | 0.416667 | 15 | 36 | |
| (31.8, 41.0] | 0.461538 | 18 | 39 | |
| (41.0, 80.0] | 0.363636 | 12 | 33 | |
| 3 | (0.419, 19.0] | 0.333333 | 36 | 108 |
| (19.0, 25.0] | 0.197674 | 17 | 86 | |
| (25.0, 31.8] | 0.283582 | 19 | 67 | |
| (31.8, 41.0] | 0.166667 | 10 | 60 | |
| (41.0, 80.0] | 0.088235 | 3 | 34 | |
| Total | 0.383838 | 342 | 891 |
| Mean Survival Rate | Survivors | Group Size | |
|---|---|---|---|
| Pclass | |||
| 1 | 0.629630 | 136 | 216 |
| 2 | 0.472826 | 87 | 184 |
| 3 | 0.242363 | 119 | 491 |
| Total | 0.383838 | 342 | 891 |
# Age and fare by class, split by survival. The single extreme fare
# outlier (> 500) is replaced with the median so it does not flatten
# the fare axis.
df = train.copy()
df.loc[df["Fare"] > 500, "Fare"] = df["Fare"].median()
df[["Survived", "Sex", "Pclass"]] = df[["Survived", "Sex", "Pclass"]].astype("category")
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(11.7, 5), tight_layout=True)
palette = {0: "#e8ba91", 1: "#394d5f"}
medianprops = {"color": "#204254", "alpha": 1}
flierprops = {"marker": "x", "mec": "#204254", "mfc": "#204254"}
kw = {"medianprops": medianprops, "flierprops": flierprops, "palette": palette}
sns.boxplot(data=df, x="Age", y="Pclass", hue="Survived", ax=ax1, **kw)
sns.boxplot(data=df, x="Fare", y="Pclass", hue="Survived", ax=ax2, **kw)
# Soften the box fill. (An unused `get_facecolor()` lookup was removed.)
for patch in np.r_[ax1.patches, ax2.patches]:
    patch.set_alpha(0.8)
ax1.legend("", frameon=False)  # show only the right-hand legend
ax2.set_ylabel("")
ax2.set_yticks([])
adjust_box_widths(fig, 0.8, "v")
plt.show()
# Survival pivot per sex, embarkation port and class. Building one pivot
# per sex group inside groupby.apply labels each group's margin row
# "<Sex> Total" (a single pivot_table call can only produce one margin name).
df = train.copy()
df["Sex"] = df["Sex"].str.title()  # "male" -> "Male" for display
df.groupby("Sex", group_keys=False).apply(
    lambda sex: sex.pivot_table(
        values="Survived",
        index=("Sex", "Embarked", "Pclass"),
        aggfunc=["mean", "sum", "count"],
        margins=True,
        margins_name=f"{sex.name} Total",  # e.g. "Female Total"
    )
    .rename(
        columns=names,
    )
    # Drop the redundant "Survived" level left by the list aggfunc.
    .droplevel(level=1, axis="columns")
).style.background_gradient(DF_CMAP)
| Mean Survival Rate | Survivors | Group Size | |||
|---|---|---|---|---|---|
| Sex | Embarked | Pclass | |||
| Female | C | 1 | 0.976744 | 42 | 43 |
| 2 | 1.000000 | 7 | 7 | ||
| 3 | 0.652174 | 15 | 23 | ||
| Q | 1 | 1.000000 | 1 | 1 | |
| 2 | 1.000000 | 2 | 2 | ||
| 3 | 0.727273 | 24 | 33 | ||
| S | 1 | 0.958333 | 46 | 48 | |
| 2 | 0.910448 | 61 | 67 | ||
| 3 | 0.375000 | 33 | 88 | ||
| Female Total | 0.740385 | 231 | 312 | ||
| Male | C | 1 | 0.404762 | 17 | 42 |
| 2 | 0.200000 | 2 | 10 | ||
| 3 | 0.232558 | 10 | 43 | ||
| Q | 1 | 0.000000 | 0 | 1 | |
| 2 | 0.000000 | 0 | 1 | ||
| 3 | 0.076923 | 3 | 39 | ||
| S | 1 | 0.354430 | 28 | 79 | |
| 2 | 0.154639 | 15 | 97 | ||
| 3 | 0.128302 | 34 | 265 | ||
| Male Total | 0.188908 | 109 | 577 |
# Sunburst of survival within each sex: inner ring = sex, outer = outcome.
df = train.copy()
df["Survived"] = df["Survived"].map({1: "Survived", 0: "Died"})
df["Sex"] = df["Sex"].map({"male": "Male", "female": "Female"})
fig = px.sunburst(
    data_frame=df,
    title="Passengers Onboard",
    path=["Sex", "Survived"],
    color_discrete_sequence=["#394d5f", "#e8ba91"],
    height=640,
    width=640,
)
trace_style = dict(
    textinfo="label+percent parent",
    insidetextorientation="horizontal",
    marker_line_width=10,
    marker_line_color=BACKGROUND_COLOR,
)
layout_style = dict(
    font_color=FONT_COLOR,
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
)
fig.update_traces(**trace_style)
fig.update_layout(**layout_style)
fig.show()
# Survival statistics for passengers travelling alone vs. with family.
df = train.copy()
df["IsAlone"] = df.eval("SibSp + Parch") == 0
alone_pivot = df.pivot_table(
    values="Survived",
    index="IsAlone",
    aggfunc=["mean", "sum", "count"],
    margins=True,
    margins_name="Total",
)
alone_pivot.rename(columns=names).droplevel(
    level=1, axis="columns"
).style.background_gradient(DF_CMAP)
| Mean Survival Rate | Survivors | Group Size | |
|---|---|---|---|
| IsAlone | |||
| False | 0.505650 | 179 | 354 |
| True | 0.303538 | 163 | 537 |
| Total | 0.383838 | 342 | 891 |
# Re-label the same frame (built in the previous cell) for a sunburst:
# inner ring = alone/not-alone, outer ring = outcome.
df["Survived"] = df["Survived"].map({1: "Survived", 0: "Died"})
df["IsAlone"] = df["IsAlone"].map({True: "Alone", False: "NotAlone"})
fig = px.sunburst(
    data_frame=df,
    title="Alone Passengers Onboard",
    path=["IsAlone", "Survived"],
    color_discrete_sequence=["#394d5f", "#e8ba91"],
    height=640,
    width=640,
)
trace_style = dict(
    textinfo="label+percent parent",
    insidetextorientation="horizontal",
    marker_line_width=10,
    marker_line_color=BACKGROUND_COLOR,
)
layout_style = dict(
    font_color=FONT_COLOR,
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
)
fig.update_traces(**trace_style)
fig.update_layout(**layout_style)
fig.show()
# Extract the honorific from each name; everything outside the four
# common titles is bucketed as "Other".
df = train.copy()
df["Title"] = df["Name"].str.extract(r" ([A-Za-z]+)\.")
common_titles = ["Mr", "Miss", "Mrs", "Master"]
rare = np.setdiff1d(df["Title"], common_titles).tolist()
df["Title"] = df["Title"].replace(rare, "Other")
# Row-normalised outcome rates per title, joined with the group sizes.
rates = pd.crosstab(
    df["Title"],
    df["Survived"],
    normalize="index",
)
titles_info = rates.join(df["Title"].value_counts()).rename(
    columns={
        0: "Death Rate",
        1: "Survival Rate",
        "Title": "Group Size",
    }
)
titles_info.style.background_gradient(DF_CMAP)
| Death Rate | Survival Rate | Group Size | |
|---|---|---|---|
| Title | |||
| Master | 0.425000 | 0.575000 | 40 |
| Miss | 0.302198 | 0.697802 | 182 |
| Mr | 0.843327 | 0.156673 | 517 |
| Mrs | 0.208000 | 0.792000 | 125 |
| Other | 0.555556 | 0.444444 | 27 |
# Long-form view of the two rates for a grouped bar chart.
melted = titles_info.reset_index().melt(
    id_vars="Title",
    value_vars=["Death Rate", "Survival Rate"],
    var_name="Rate",
    value_name="Value",
)
fig, ax = plt.subplots(figsize=(11, 6), tight_layout=True)
palette = {"Death Rate": "#e8ba91", "Survival Rate": "#394d5f"}
sns.barplot(
    data=melted, x="Title", y="Value", hue="Rate", palette=palette,
    alpha=0.8, ax=ax,
)
# Dashed reference line at the overall survival rate.
ax.axhline(df["Survived"].mean(), linewidth=2, color="#204254", linestyle="--")
ax.text(2.07, 0.41, "Mean Survival Rate", fontsize=11)
ax.legend(loc="upper left")
plt.show()
# Survival by cabin deck (first letter of the cabin code) and class.
# Rows with no cabin recorded are NaN and drop out of the pivot.
df = train.copy()
df["Cabin"] = df["Cabin"].str[0]
deck_pivot = df.pivot_table(
    values="Survived",
    index=["Cabin", "Pclass"],
    aggfunc=["mean", "sum", "count"],
    margins=True,
    margins_name="Total",
)
deck_pivot.rename(columns=names).droplevel(
    level=1, axis="columns"
).style.background_gradient(DF_CMAP)
| Mean Survival Rate | Survivors | Group Size | ||
|---|---|---|---|---|
| Cabin | Pclass | |||
| A | 1 | 0.466667 | 7 | 15 |
| B | 1 | 0.744681 | 35 | 47 |
| C | 1 | 0.593220 | 35 | 59 |
| D | 1 | 0.758621 | 22 | 29 |
| 2 | 0.750000 | 3 | 4 | |
| E | 1 | 0.720000 | 18 | 25 |
| 2 | 0.750000 | 3 | 4 | |
| 3 | 1.000000 | 3 | 3 | |
| F | 2 | 0.875000 | 7 | 8 |
| 3 | 0.200000 | 1 | 5 | |
| G | 3 | 0.500000 | 2 | 4 |
| T | 1 | 0.000000 | 0 | 1 |
| Total | 0.666667 | 136 | 204 |
# Mean survival per deck, split by class, against the overall rate.
fig, ax = plt.subplots(figsize=(11, 6), tight_layout=True)
palette = {1: "#e8ba91", 2: "#394d5f", 3: "#204254"}
sns.barplot(
    data=df,
    x="Cabin",
    y="Survived",
    hue="Pclass",
    alpha=0.8,
    errorbar=None,
    palette=palette,
    ax=ax,
)
ax.axhline(df["Survived"].mean(), linewidth=2, color="#204254", linestyle="--")
ax.text(6.20, 0.42, "Mean Survival Rate")
ax.legend(loc="upper left", title="Pclass")
plt.show()
# Preprocessing for t-SNE: standardised Age, log-compressed + standardised
# Fare, and ordinal-encoded categoricals; all other columns are dropped.
_age_pipe = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
_fare_pipe = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(func=np.log1p, inverse_func=np.expm1),
    StandardScaler(),
)
_cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"), OrdinalEncoder()
)
tsne_preprocess = make_column_transformer(
    (_age_pipe, ["Age"]),
    (_fare_pipe, ["Fare"]),
    (_cat_pipe, ["Pclass", "Sex", "Embarked", "IsAlone"]),
    remainder="drop",
)
# Embed the preprocessed passengers into 3 dimensions with t-SNE and
# attach the survival label for colouring the scatter plot below.
df = train.copy()
df["IsAlone"] = df.eval("SibSp + Parch == 0")
labels = train["Survived"].astype("category")
tsne = TSNE(n_components=3, random_state=42)
embedding = tsne.fit_transform(tsne_preprocess.fit_transform(df))
X_3d = pd.DataFrame(
    embedding, columns=["x1", "x2", "x3"], index=labels.index
).join(labels)
X_3d.head()
| x1 | x2 | x3 | Survived | |
|---|---|---|---|---|
| PassengerId | ||||
| 1 | -6.790534 | -5.579925 | -1.292992 | 0 |
| 2 | 10.453622 | 0.112417 | 5.057104 | 1 |
| 3 | -8.081683 | -0.127755 | 5.887538 | 1 |
| 4 | 8.922507 | -1.792780 | -3.436335 | 1 |
| 5 | -3.929063 | 10.164669 | -0.584738 | 0 |
# Interactive 3-D scatter of the t-SNE embedding, coloured by survival.
scatter_kw = dict(
    x="x1",
    y="x2",
    z="x3",
    symbol="Survived",
    symbol_sequence=["circle", "diamond"],
    color="Survived",
    color_discrete_sequence=["#e8ba91", "#394d5f"],
    opacity=0.5,
    height=740,
    width=740,
    title="Titanic Survivors - 3D projection with t-SNE",
)
fig = px.scatter_3d(data_frame=X_3d, **scatter_kw)
fig.update_layout(
    font_color=FONT_COLOR,
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
)
fig.update_traces(marker_size=4)
fig.show()
# Step 1: import the required libraries
# Core data-handling libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Machine-learning tooling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Plot display configuration
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)
'Librairies importées avec succès !'
# Step 2: load the raw CSV files and take a first look at the training set.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
submission_data = pd.read_csv("gender_submission.csv")
# First rows, full summary statistics, and the dtype/NaN report.
train_head = train_data.head()
train_description = train_data.describe(include="all")
train_info = train_data.info()  # prints the report; the return is None
train_head, train_description
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 891 non-null int64 1 Survived 891 non-null int64 2 Pclass 891 non-null int64 3 Name 891 non-null object 4 Sex 891 non-null object 5 Age 714 non-null float64 6 SibSp 891 non-null int64 7 Parch 891 non-null int64 8 Ticket 891 non-null object 9 Fare 891 non-null float64 10 Cabin 204 non-null object 11 Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.7+ KB
( PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3
Name Sex Age SibSp \
0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
4 Allen, Mr. William Henry male 35.0 0
Parch Ticket Fare Cabin Embarked
0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
3 0 113803 53.1000 C123 S
4 0 373450 8.0500 NaN S ,
PassengerId Survived Pclass Name Sex \
count 891.000000 891.000000 891.000000 891 891
unique NaN NaN NaN 891 2
top NaN NaN NaN Braund, Mr. Owen Harris male
freq NaN NaN NaN 1 577
mean 446.000000 0.383838 2.308642 NaN NaN
std 257.353842 0.486592 0.836071 NaN NaN
min 1.000000 0.000000 1.000000 NaN NaN
25% 223.500000 0.000000 2.000000 NaN NaN
50% 446.000000 0.000000 3.000000 NaN NaN
75% 668.500000 1.000000 3.000000 NaN NaN
max 891.000000 1.000000 3.000000 NaN NaN
Age SibSp Parch Ticket Fare Cabin \
count 714.000000 891.000000 891.000000 891 891.000000 204
unique NaN NaN NaN 681 NaN 147
top NaN NaN NaN 347082 NaN B96 B98
freq NaN NaN NaN 7 NaN 4
mean 29.699118 0.523008 0.381594 NaN 32.204208 NaN
std 14.526497 1.102743 0.806057 NaN 49.693429 NaN
min 0.420000 0.000000 0.000000 NaN 0.000000 NaN
25% 20.125000 0.000000 0.000000 NaN 7.910400 NaN
50% 28.000000 0.000000 0.000000 NaN 14.454200 NaN
75% 38.000000 1.000000 0.000000 NaN 31.000000 NaN
max 80.000000 8.000000 6.000000 NaN 512.329200 NaN
Embarked
count 889
unique 3
top S
freq 644
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN )
# Step 3: exploratory data analysis. Axis labels and titles stay in
# French to match the rest of this section's output.

def _countplot(x, title, xlabel, hue=None, legend_title=None, figsize=(6, 5)):
    # Bar chart of value counts from train_data, optionally split by hue.
    plt.figure(figsize=figsize)
    sns.countplot(x=x, hue=hue, data=train_data)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Nombre de passagers')
    if legend_title is not None:
        plt.legend(title=legend_title)
    plt.show()


# Survivor counts: overall, by sex, and by class.
_countplot('Survived', 'Distribution des survivants', 'Survivants')
_countplot(
    'Survived', 'Distribution des survivants par sexe', 'Survivants',
    hue='Sex', legend_title='Sexe',
)
_countplot(
    'Survived', 'Distribution des survivants par classe', 'Survivants',
    hue='Pclass', legend_title='Classe',
)

# Age distribution (missing ages dropped before plotting).
plt.figure(figsize=(10, 6))
sns.histplot(train_data['Age'].dropna(), kde=True, bins=30)
plt.title('Distribution des âges')
plt.xlabel('Âge')
plt.ylabel('Nombre de passagers')
plt.show()

# Fare distribution.
plt.figure(figsize=(10, 6))
sns.histplot(train_data['Fare'], kde=True, bins=40)
plt.title('Distribution des tarifs')
plt.xlabel('Tarif')
plt.ylabel('Nombre de passagers')
plt.show()

# Counts of siblings/spouses and of parents/children aboard.
_countplot(
    'SibSp', 'Distribution du nombre de frères et sœurs/conjoints',
    'Nombre de frères et sœurs/conjoints', figsize=(8, 6),
)
_countplot(
    'Parch', 'Distribution du nombre de parents/enfants',
    'Nombre de parents/enfants', figsize=(8, 6),
)
# Step 4: preprocessing and feature engineering.

# --- Missing values ---
# Fill missing ages with the median. Direct column assignment replaces
# `Series.fillna(..., inplace=True)`, which operates on a chained
# selection and is deprecated/unreliable under pandas copy-on-write.
age_median = train_data['Age'].median()
train_data['Age'] = train_data['Age'].fillna(age_median)
# Fill the missing embarkation ports with the most frequent port.
embarked_mode = train_data['Embarked'].mode()[0]
train_data['Embarked'] = train_data['Embarked'].fillna(embarked_mode)
# Drop "Cabin": it is mostly missing (687/891 NaN in this dataset).
train_data.drop('Cabin', axis=1, inplace=True)

# --- Feature engineering ---
# Extract the honorific ("Mr", "Miss", ...) between the comma and the dot.
train_data['Title'] = train_data['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
# Collapse rare titles into "Other" and normalise variant spellings.
rare_titles = ['Dr', 'Rev', 'Col', 'Major', 'Lady', 'Jonkheer', 'Don', 'Capt', 'Sir', 'Countess']
train_data['Title'] = train_data['Title'].replace(rare_titles, 'Other')
train_data['Title'] = train_data['Title'].replace('Mlle', 'Miss')
train_data['Title'] = train_data['Title'].replace('Ms', 'Miss')
train_data['Title'] = train_data['Title'].replace('Mme', 'Mrs')
# Family size = siblings/spouses + parents/children + the passenger.
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1
# Drop identifier-like columns not used by the model below.
train_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1, inplace=True)

# --- Encoding ---
# Integer-encode the categoricals; keep each fitted encoder for reuse.
label_encoders = {}
categorical_cols = ['Sex', 'Embarked', 'Title']
for col in categorical_cols:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    label_encoders[col] = le

# Sanity check of the frame after preprocessing.
train_data_head = train_data.head()
train_data_head
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | Title | FamilySize | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 22.0 | 1 | 0 | 7.2500 | 2 | 2 | 2 |
| 1 | 1 | 1 | 0 | 38.0 | 1 | 0 | 71.2833 | 0 | 3 | 2 |
| 2 | 1 | 3 | 0 | 26.0 | 0 | 0 | 7.9250 | 2 | 1 | 1 |
| 3 | 1 | 1 | 0 | 35.0 | 1 | 0 | 53.1000 | 2 | 3 | 2 |
| 4 | 0 | 3 | 1 | 35.0 | 0 | 0 | 8.0500 | 2 | 2 | 1 |
# Step 5: hold out 20% of the rows for validation (fixed seed for
# reproducibility).
X = train_data.drop('Survived', axis=1)
y = train_data['Survived']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Shapes of the four resulting splits.
X_train.shape, X_val.shape, y_train.shape, y_val.shape
((712, 9), (179, 9), (712,), (179,))
# Steps 6-7: choose and fit a 100-tree random forest classifier.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
RandomForestClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=42)
# Step 8: score the model on the held-out validation split.
y_pred = clf.predict(X_val)
# Overall accuracy plus the per-class report and confusion matrix.
accuracy = accuracy_score(y_val, y_pred)
confusion = confusion_matrix(y_val, y_pred)
report = classification_report(y_val, y_pred)
accuracy, report, confusion
(0.8324022346368715,
' precision recall f1-score support\n\n 0 0.85 0.87 0.86 105\n 1 0.81 0.78 0.79 74\n\n accuracy 0.83 179\n macro avg 0.83 0.83 0.83 179\nweighted avg 0.83 0.83 0.83 179\n',
array([[91, 14],
[16, 58]], dtype=int64))
# Step 9: visualise the confusion matrix and the feature importances.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(confusion, annot=True, fmt='g', cmap='Blues', cbar=False, ax=ax)
ax.set_xlabel('Prédictions')
ax.set_ylabel('Valeurs réelles')
ax.set_title('Matrice de confusion')
ax.set_xticks([0.5, 1.5])
ax.set_xticklabels(['Non survécu', 'Survécu'])
ax.set_yticks([0.5, 1.5])
ax.set_yticklabels(['Non survécu', 'Survécu'], rotation=0)
plt.show()

# Horizontal bars ordered from least to most important feature.
importances = clf.feature_importances_
feature_names = X_train.columns
order = np.argsort(importances)
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(range(len(order)), importances[order], align='center')
ax.set_yticks(range(len(order)))
ax.set_yticklabels([feature_names[i] for i in order])
ax.set_xlabel('Importance')
ax.set_title('Importance des caractéristiques')
plt.show()